import os
import pandas as pd
# ====== Preprocessing ==========================
# Read raw cleaned data; CSVs live in ../data_cleaned relative to the CWD
master_dir = os.path.dirname(os.getcwd())
# Google reviews
file_google = os.path.join(master_dir, "data_cleaned",
"GoogleReview_data_cleaned.csv")
# TripAdvisor reviews
file_tripadvisor = os.path.join(master_dir, "data_cleaned", "TripAdvisor_data_cleaned.csv")
df_google = pd.read_csv(file_google)
df_TA = pd.read_csv(file_tripadvisor)
# combine reviews from both sources (stack rows)
df = pd.concat([df_TA, df_google], axis=0)
# keep an untouched copy for the exploratory analysis below
df_ori = df.copy()
# number of records
sampleNum = df.index.size
print(f"Sample Number: {sampleNum}")
Sample Number: 361784
# number of distinct restaurants across both sources
uniqueRestaurants = df['Restaurant'].unique().size
print(f"Unique Restaurants: {uniqueRestaurants}")
Unique Restaurants: 3515
# Subset wanted columns for modelling, then drop duplicate (Review, Rating) pairs
df = df[['Review', 'Rating']]
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
# Adding labels
def labelFunc(x):
    """Map a numeric rating to a sentiment label.

    Ratings above 3 are "Positive", below 3 "Negative", exactly 3 "Neutral".
    (Bug fix: the original returned "Neutral" for ratings < 3 and "Negative"
    for ratings == 3, contradicting the df_positive / df_neutral / df_negative
    splits defined later, which use >3 / ==3 / <3 respectively.)
    """
    rating = int(x)
    if rating > 3:
        return "Positive"
    elif rating < 3:
        return "Negative"
    else:  # rating == 3
        return "Neutral"
# Add sentiment label derived from the numeric rating
df_ori["Label"] = df_ori["Rating"].apply(labelFunc)
# review counts per location
df_ori["Location"].value_counts()
KL 98750 Petaling Jaya 49932 Penang 44502 JB 35238 Ipoh 30796 Kuching 29603 Langkawi 29303 Melaka 26690 Shah Alam 8658 Miri 8312 Name: Location, dtype: int64
# Bar chart: number of reviews per location
df_ori["Location"].value_counts().plot(kind='bar',
figsize=(10, 8),
ylabel="No. of reviews",
title="Review Distribution",
cmap='summer',
);
The highest number of reviews is in Kuala Lumpur, and the lowest is in Miri.
# Rating Distribution: review counts per star value
df_ori["Rating"].value_counts().plot(kind='bar',
figsize=(8, 6),
ylabel="No. of reviews",
title="Rating Distribution");
# Average Rating over all reviews, rounded to 2 decimal places
avgRating = df_ori["Rating"].mean().round(2)
print(f"Average Rating: {avgRating}")
Average Rating: 4.18
Ratings are skewed towards the high (positive sentiment) side, as the average rating is 4.18.
# Average rating across different locations
df_ori.pivot_table(index='Location', values='Rating', aggfunc='mean') \
.sort_values('Rating', ascending=False) \
.plot(kind='bar', ylabel="Average Rating",
title="Rating Distribution", figsize=(10, 5), cmap='jet'
);
Langkawi has the highest average review rating, whereas Melaka has the lowest.
# Sentiment class counts
df_ori["Label"].value_counts().plot(kind='bar',
figsize=(8, 6),
ylabel="No. of reviews",
title="Sentiment Class Distribution");
# Proportion of sentiment type (percent of all reviews)
pd.DataFrame(df_ori["Label"].value_counts() / df_ori.index.size * 100) \
.rename(columns={'Label': "Percentage"})
| Percentage | |
|---|---|
| Positive | 80.087842 |
| Negative | 11.010714 |
| Neutral | 8.901444 |
More than 80% of the reviews are positive, whereas the neutral and negative reviews account for around 10% each.
# Mean rating per restaurant, highest first
averageRating_restaurants = df_ori[["Restaurant", "Rating"]] \
.pivot_table(index=['Restaurant'], values='Rating', aggfunc='mean') \
.sort_values('Rating', ascending=False)
averageRating_restaurants.head(10)
| Rating | |
|---|---|
| Restaurant | |
| Spiced Pumpkin Cafe | 5.0 |
| Baan Carabao | 5.0 |
| Latte Be Light | 5.0 |
| Ayam Penyet | 5.0 |
| Restaurant Pin Wei Seafood | 5.0 |
| Restaurant Smk Corner | 5.0 |
| Restaurant Soon Tong | 5.0 |
| Thirty8 Fashion | 5.0 |
| Canard eat & roll | 5.0 |
| Restoran Bendang | 5.0 |
This may not be representative, as some restaurants with a high rating may have very few reviews.
# get the frequency (review count) of each restaurant
restaurant_reviewcount = pd.DataFrame(df_ori["Restaurant"].value_counts())
# NOTE(review): this rename relies on the pre-2.0 pandas value_counts layout
# (index holds names, column is 'Restaurant') — verify under newer pandas.
restaurant_reviewcount = restaurant_reviewcount.reset_index().rename(columns={'index': "Restaurant", 'Restaurant': 'Review Count'})
# sort by review count first, then average rating; keep the top 10
overall_toprestaurants = averageRating_restaurants \
.reset_index().merge(restaurant_reviewcount, on='Restaurant') \
.sort_values(by=['Review Count', 'Rating'], ascending=False).head(10)
overall_toprestaurants
| Restaurant | Rating | Review Count | |
|---|---|---|---|
| 406 | Dining In The Dark KL | 4.672455 | 2073 |
| 819 | Ishin Japanese Dining | 4.459525 | 2063 |
| 876 | Khan’s Indian Cuisine | 4.429253 | 1887 |
| 2625 | Hard Rock Cafe | 3.826692 | 1581 |
| 933 | Wonderland Food Store | 4.403743 | 1496 |
| 2739 | Geographer Cafe | 3.753274 | 1451 |
| 965 | BBQ NIGHTS | 4.398628 | 1312 |
| 361 | The Whisky Bar | 4.735202 | 1284 |
| 245 | Canopy Rooftop Bar and Lounge | 4.880126 | 1268 |
| 2600 | Madam Kwan's KLCC | 3.843849 | 1268 |
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Dual-axis chart for the top-10 restaurants: review count as bars on the
# left (primary) axis, average rating as a line on the right (secondary) axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x=overall_toprestaurants['Restaurant'],
               y=overall_toprestaurants['Rating'], name="Rating", mode="lines"),
    secondary_y=True
)
fig.add_trace(
    go.Bar(x=overall_toprestaurants['Restaurant'],
           y=overall_toprestaurants['Review Count'], name="Review Count"),
    secondary_y=False
)
# Bug fix: the x axis holds restaurant names, not the review count.
fig.update_xaxes(title_text="Restaurant")
# Set y-axes titles
fig.update_yaxes(title_text="Review Count", secondary_y=False)
fig.update_yaxes(title_text="Rating", secondary_y=True)
fig.show()
import nltk
# word count per review: tokenize on word characters, ignoring punctuation
tokenizer = nltk.RegexpTokenizer(r"\w+")
df_ori['Word_count'] = df_ori['Review'].apply(lambda x: len(tokenizer.tokenize(x)))
import matplotlib.pyplot as plt
# scatter of rating vs review length (low alpha because of 300k+ points)
plt.scatter(x=df_ori['Word_count'], y=df_ori['Rating'], alpha=0.02)
plt.xlabel('Word count')
plt.ylabel('Rating')
plt.title('Relationship between rating and word count');
# Pearson correlation between review length and rating
df_ori[['Word_count', 'Rating']].corr()
| Word_count | Rating | |
|---|---|---|
| Word_count | 1.000000 | -0.111255 |
| Rating | -0.111255 | 1.000000 |
No strong correlation can be observed; word count is only slightly negatively correlated with rating, i.e. longer reviews tend to have slightly lower ratings.
import datetime
import re
import matplotlib.pyplot as plt
def dateconvert(x, startdate):
    """Convert a TripAdvisor relative-date string to an absolute date.

    Handles "N days ago" / "N weeks ago" (subtracted from *startdate*),
    the literals "yesterday" and "today", and otherwise falls back to
    parsing an absolute date like "30 April 2022" ("%d %B %Y"); the
    fallback returns a datetime rather than a date.
    """
    match = re.search(r"(\bweek(s)?\b)|(\bday(s)?\b)", x)
    if match:
        # raw string fixes the invalid-escape warning of the original "\d+"
        delta = int(re.search(r"\d+", x)[0])
        # weeks
        if re.search(r"\bweek(s)?\b", x):
            date = startdate - datetime.timedelta(weeks=delta)
        # days (the outer match guarantees one of the two branches)
        else:
            date = startdate - datetime.timedelta(days=delta)
    # Bug fix: the original used `x in "yesterday"` / `x in "today"`,
    # substring tests that also match fragments like "to"; exact string
    # comparison is what was intended.
    elif x == "yesterday":
        date = startdate - datetime.timedelta(days=1)
    elif x == "today":
        date = startdate
    else:
        date = datetime.datetime.strptime(x, "%d %B %Y")
    return date
# Review Timeline
# strip the "Reviewed " prefix from the TripAdvisor date strings
df_TA["Date"] = df_TA["Dates"].str.replace(r"Reviewed\s", "", regex=True).str.strip()
# resolve relative dates against the scrape date (30 Apr 2022)
df_TA["Date"] = df_TA["Date"].apply(dateconvert, startdate=datetime.date(2022, 4, 30));
plt.figure(figsize=(15, 10))
# monthly review counts
plt.plot(df_TA.set_index("Date")["Review"].resample("M").count())
plt.xlabel('Year')
plt.ylabel('Number of reviews per month')
plt.title('Reviews from TripAdvisor')
plt.show()
# Data between Dec'2019 to Apr'2022
import datetime
# monthly review counts restricted to Dec 2019 – Apr 2022
ts_data = df_TA.set_index("Date")["Review"].resample("M").count()
ts_data = ts_data[(ts_data.index > "2019-11-01" )& (ts_data.index < "2022-05-01") ]
# relabel the index as "Mon'YYYY" strings for display
ts_data.index = ts_data.reset_index().Date.apply(lambda x: datetime.datetime.strftime(x, "%b'%Y"))
pd.DataFrame(ts_data)
| Review | |
|---|---|
| Date | |
| Nov'2019 | 1963 |
| Dec'2019 | 2484 |
| Jan'2020 | 2060 |
| Feb'2020 | 1841 |
| Mar'2020 | 1105 |
| Apr'2020 | 98 |
| May'2020 | 176 |
| Jun'2020 | 500 |
| Jul'2020 | 814 |
| Aug'2020 | 857 |
| Sep'2020 | 830 |
| Oct'2020 | 747 |
| Nov'2020 | 539 |
| Dec'2020 | 738 |
| Jan'2021 | 375 |
| Feb'2021 | 305 |
| Mar'2021 | 663 |
| Apr'2021 | 626 |
| May'2021 | 204 |
| Jun'2021 | 65 |
| Jul'2021 | 61 |
| Aug'2021 | 140 |
| Sep'2021 | 377 |
| Oct'2021 | 597 |
| Nov'2021 | 524 |
| Dec'2021 | 641 |
| Jan'2022 | 613 |
| Feb'2022 | 322 |
| Mar'2022 | 46 |
| Apr'2022 | 578 |
# monthly review counts broken down per location (counts Author entries)
location_pivot = df_TA.pivot_table(index=[df_TA["Date"].dt.year, df_TA["Date"].dt.month],
columns="Location",
aggfunc="count")["Author"]
location_pivot.plot(figsize=(20, 10),
xlabel="(Year, month)",
ylabel="Number of reviews per month",
title="Reviews from TripAdvisor breakdown by Location");
It can be observed that the number of reviews dropped significantly at the beginning of 2020, which corresponds to the start of the COVID-19 pandemic in Malaysia.
The subsequent ups and downs may correspond to the movement control orders (MCOs) imposed by the Malaysian government.
# df of each sentiment category, split on the numeric rating
df_positive = df[df["Rating"] > 3]
df_neutral = df[df["Rating"] == 3]
df_negative = df[df["Rating"] < 3]
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
# Bug fix: NLTK stopword corpus fileids are lowercase; "English" only worked
# on case-insensitive filesystems (Windows/macOS). Use the canonical id.
stopwordz = set(stopwords.words("english"))
stopwordz.remove('not')  # keep "not": negation carries sentiment signal
# Unigram
# overall corpus: lower-case every review and join into one big string
texts = " ".join(review.lower() for review in df["Review"])
# remove punctuation by tokenizing on word characters only
tokenizer = nltk.RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(texts)
# remove stopwords
tokens = [token for token in tokens if token not in stopwordz]
# 20 most frequent unigrams over the whole corpus
overall_unigram = nltk.FreqDist(tokens).most_common(20)
overall_unigram
overallUnigram_df = pd.DataFrame(overall_unigram, columns=['Unigram', 'Count'])
overallUnigram_df
| Unigram | Count | |
|---|---|---|
| 0 | food | 208145 |
| 1 | good | 145511 |
| 2 | not | 87382 |
| 3 | service | 83760 |
| 4 | place | 82728 |
| 5 | great | 71729 |
| 6 | nice | 67433 |
| 7 | restaurant | 64852 |
| 8 | staff | 46725 |
| 9 | delicious | 41794 |
| 10 | friendly | 38565 |
| 11 | one | 37525 |
| 12 | best | 37105 |
| 13 | really | 36711 |
| 14 | price | 35100 |
| 15 | chicken | 34627 |
| 16 | time | 33902 |
| 17 | taste | 32343 |
| 18 | like | 31600 |
| 19 | try | 31507 |
# overall word cloud over the stopword-filtered tokens
wordcloudz = WordCloud(stopwords=set(stopwordz)).generate(" ".join(tokens))
plt.figure(figsize = (12,8))
plt.imshow(wordcloudz, interpolation='bilinear');
# positive corpus: lower-case and join all positive reviews
texts_positive = " ".join(review.lower() for review in df_positive["Review"])
# remove punctuation
# NOTE(review): tokenizer_positive is created but never used; the shared
# `tokenizer` from the overall section does the tokenizing on the next line.
tokenizer_positive = nltk.RegexpTokenizer(r"\w+")
tokens_positive = tokenizer.tokenize(texts_positive)
# remove stopwords
tokens_positive = [token for token in tokens_positive if token not in stopwordz]
positive_unigram = nltk.FreqDist(tokens_positive).most_common(20)
# top positive unigram
positiveUnigram_df = pd.DataFrame(positive_unigram, columns=['Unigram', 'Count'])
positiveUnigram_df
| Unigram | Count | |
|---|---|---|
| 0 | food | 162607 |
| 1 | good | 121507 |
| 2 | place | 68247 |
| 3 | great | 66088 |
| 4 | service | 65491 |
| 5 | nice | 57890 |
| 6 | restaurant | 49845 |
| 7 | not | 45764 |
| 8 | delicious | 39303 |
| 9 | staff | 37799 |
| 10 | friendly | 35393 |
| 11 | best | 34125 |
| 12 | really | 29673 |
| 13 | one | 28039 |
| 14 | try | 27042 |
| 15 | price | 25554 |
| 16 | also | 25530 |
| 17 | chicken | 25426 |
| 18 | time | 25147 |
| 19 | well | 24551 |
# horizontal bar chart of the top positive unigrams
plt.barh(positiveUnigram_df['Unigram'], width=positiveUnigram_df["Count"]);
plt.xlabel('Frequency');
plt.ylabel('Unigram');
# positive word cloud
wordcloudz_positive = WordCloud(stopwords=set(stopwordz)).generate(texts_positive)
plt.figure(figsize = (12,8))
plt.imshow(wordcloudz_positive, interpolation='bilinear');
# neutral corpus: lower-case and join all neutral reviews
texts_neutral = " ".join(review.lower() for review in df_neutral["Review"])
# remove punctuation
# NOTE(review): tokenizer_neutral is unused; the shared `tokenizer` is used.
tokenizer_neutral = nltk.RegexpTokenizer(r"\w+")
tokens_neutral = tokenizer.tokenize(texts_neutral)
# remove stopwords
tokens_neutral = [token for token in tokens_neutral if token not in stopwordz]
neutral_unigram = nltk.FreqDist(tokens_neutral).most_common(20)
# Top neutral unigram
neutralUnigram_df = pd.DataFrame(neutral_unigram, columns=['Unigram', 'Count'])
neutralUnigram_df
| Unigram | Count | |
|---|---|---|
| 0 | food | 24472 |
| 1 | not | 19250 |
| 2 | good | 16550 |
| 3 | place | 8685 |
| 4 | service | 8671 |
| 5 | restaurant | 7633 |
| 6 | nice | 6913 |
| 7 | taste | 5998 |
| 8 | price | 5427 |
| 9 | chicken | 5137 |
| 10 | like | 4605 |
| 11 | ok | 4552 |
| 12 | one | 4330 |
| 13 | average | 4328 |
| 14 | quite | 4131 |
| 15 | time | 4108 |
| 16 | staff | 3941 |
| 17 | great | 3934 |
| 18 | ordered | 3838 |
| 19 | bit | 3704 |
# horizontal bar chart of the top neutral unigrams
plt.barh(neutralUnigram_df['Unigram'], width=neutralUnigram_df["Count"]);
plt.xlabel('Frequency');
plt.ylabel('Unigram');
# neutral word cloud
wordcloudz_neutral = WordCloud(stopwords=set(stopwordz)).generate(texts_neutral)
plt.figure(figsize = (12,8))
plt.imshow(wordcloudz_neutral, interpolation='bilinear');
# negative corpus: lower-case and join all negative reviews
texts_negative = " ".join(review.lower() for review in df_negative["Review"])
# remove punctuation
# NOTE(review): tokenizer_negative is unused; the shared `tokenizer` is used.
tokenizer_negative = nltk.RegexpTokenizer(r"\w+")
tokens_negative = tokenizer.tokenize(texts_negative)
# remove stopwords
tokens_negative = [token for token in tokens_negative if token not in stopwordz]
negative_unigram = nltk.FreqDist(tokens_negative).most_common(20)
negativeUnigram_df = pd.DataFrame(negative_unigram, columns=['Unigram', 'Count'])
negativeUnigram_df
| Unigram | Count | |
|---|---|---|
| 0 | not | 22368 |
| 1 | food | 21066 |
| 2 | service | 9598 |
| 3 | good | 7454 |
| 4 | restaurant | 7374 |
| 5 | place | 5796 |
| 6 | us | 5569 |
| 7 | one | 5156 |
| 8 | like | 5029 |
| 9 | taste | 5029 |
| 10 | staff | 4985 |
| 11 | ordered | 4947 |
| 12 | bad | 4769 |
| 13 | time | 4647 |
| 14 | order | 4345 |
| 15 | even | 4144 |
| 16 | price | 4119 |
| 17 | chicken | 4064 |
| 18 | table | 3658 |
| 19 | came | 3548 |
# horizontal bar chart of the top negative unigrams
plt.barh(negativeUnigram_df['Unigram'], width=negativeUnigram_df["Count"]);
plt.xlabel('Frequency');
plt.ylabel('Unigram');
# negative word cloud
wordcloudz_negative = WordCloud(stopwords=set(stopwordz)).generate(texts_negative)
plt.figure(figsize = (12,8))
plt.imshow(wordcloudz_negative, interpolation='bilinear');
# breakdown by each sentiment: top-10 unigrams side by side in one table
unigram_combined_df_each = pd.concat([
positiveUnigram_df.rename(columns={'Unigram': 'Positive'})['Positive'],
neutralUnigram_df.rename(columns={'Unigram': 'Neutral'})['Neutral'],
negativeUnigram_df.rename(columns={'Unigram': 'Negative'})['Negative']
], axis=1).head(10)
unigram_combined_df_each
| Positive | Neutral | Negative | |
|---|---|---|---|
| 0 | food | food | not |
| 1 | good | not | food |
| 2 | place | good | service |
| 3 | great | place | good |
| 4 | service | service | restaurant |
| 5 | nice | restaurant | place |
| 6 | restaurant | nice | us |
| 7 | not | taste | one |
| 8 | delicious | price | like |
| 9 | staff | chicken | taste |
# overall
# NOTE(review): head(10) after a row-wise concat keeps only the first 10
# rows, i.e. the positive top-10 — this is not a true overall ranking;
# confirm whether a groupby-sum over all three frames was intended.
unigram_combined_df = pd.concat([
positiveUnigram_df,
neutralUnigram_df,
negativeUnigram_df
], axis=0).head(10)
unigram_combined_df
| Unigram | Count | |
|---|---|---|
| 0 | food | 162607 |
| 1 | good | 121507 |
| 2 | place | 68247 |
| 3 | great | 66088 |
| 4 | service | 65491 |
| 5 | nice | 57890 |
| 6 | restaurant | 49845 |
| 7 | not | 45764 |
| 8 | delicious | 39303 |
| 9 | staff | 37799 |
It can be observed that the most common attributes are food, place, and service.
# Take note on memory on vectorizer, limit max features
from sklearn.feature_extraction.text import CountVectorizer
# bigram vectorizer: keep only the 1000 most frequent bigrams
vect = CountVectorizer(stop_words=stopwordz, ngram_range=(2, 2), max_features=1000)
# Positive reviews
bigrams_positive = vect.fit_transform(df_positive['Review'])
bigram_positive_df = pd.DataFrame(
bigrams_positive.toarray(), columns=vect.get_feature_names_out())
# total occurrences of each bigram across all positive documents
bigram_frequency = pd.DataFrame(bigram_positive_df.sum(axis=0)).reset_index()
bigram_frequency.columns = ['bigram', 'frequency']
bigram_frequency = bigram_frequency.sort_values(
by='frequency', ascending=False).head(20)
plt.barh(bigram_frequency["bigram"], width=bigram_frequency["frequency"]);
plt.xlabel('Frequency');
plt.ylabel('Bigram');
Many positive bigrams can be observed, such as "good food", "food good", "good service", "must try", "nice place", etc.
# Neutral reviews (refits the same vectorizer on the neutral subset)
bigrams_neutral = vect.fit_transform(df_neutral['Review'])
bigram_neutral_df = pd.DataFrame(
bigrams_neutral.toarray(), columns=vect.get_feature_names_out())
bigram_frequency = pd.DataFrame(bigram_neutral_df.sum(axis=0)).reset_index()
bigram_frequency.columns = ['bigram', 'frequency']
bigram_frequency = bigram_frequency.sort_values(
by='frequency', ascending=False).head(20)
plt.barh(bigram_frequency["bigram"], width=bigram_frequency["frequency"]);
plt.xlabel('Frequency');
plt.ylabel('Bigram');
The top bigram ("food good") is the same for both the positive and neutral classes.
This shows that the neutral sentiment class is rather ambiguous: some reviews containing "food good" are positive while others are neutral, and at the same time more negative bigrams, such as "not great" and "not good", also appear within the neutral class.
However, some more distinctly neutral bigrams are noticed as well, such as "not bad" and "food ok".
# %%
# negative reviews (refits the same vectorizer on the negative subset)
bigrams_negative = vect.fit_transform(df_negative['Review'])
bigram_negative_df = pd.DataFrame(
bigrams_negative.toarray(), columns=vect.get_feature_names_out())
bigram_frequency = pd.DataFrame(bigram_negative_df.sum(axis=0)).reset_index()
bigram_frequency.columns = ['bigram', 'frequency']
bigram_frequency = bigram_frequency.sort_values(
by='frequency', ascending=False).head(20)
plt.barh(bigram_frequency["bigram"], width=bigram_frequency["frequency"]);
plt.xlabel('Frequency');
plt.ylabel('Bigram');
Many negative bigrams can be observed, such as "not worth", "bad service", "not good", "not recommended", and many other negation lexicons, such as "food not", "not even", and "would not".
# Trigram vectorizer: keep only the 1000 most frequent trigrams
vect = CountVectorizer(stop_words=stopwordz, ngram_range=(3, 3), max_features=1000)
# Positive reviews
trigram_positive = vect.fit_transform(df_positive['Review'])
trigram_positive_df = pd.DataFrame(
trigram_positive.toarray(), columns=vect.get_feature_names_out())
trigram_frequency = pd.DataFrame(trigram_positive_df.sum(axis=0)).reset_index()
trigram_frequency.columns = ['trigram', 'frequency']
trigram_frequency = trigram_frequency.sort_values(
by='frequency', ascending=False).head(20)
plt.barh(trigram_frequency["trigram"], width=trigram_frequency["frequency"]);
Based on trigrams, service-related lexicons are the most common, followed by food and then price.
# neutral reviews (refits the trigram vectorizer on the neutral subset)
trigram_neutral = vect.fit_transform(df_neutral['Review'])
trigram_neutral_df = pd.DataFrame(
trigram_neutral.toarray(), columns=vect.get_feature_names_out())
trigram_frequency = pd.DataFrame(trigram_neutral_df.sum(axis=0)).reset_index()
trigram_frequency.columns = ['trigram', 'frequency']
trigram_frequency = trigram_frequency.sort_values(
by='frequency', ascending=False).head(20)
plt.barh(trigram_frequency["trigram"], width=trigram_frequency["frequency"]);
# negative reviews (refits the trigram vectorizer on the negative subset)
trigram_negative = vect.fit_transform(df_negative['Review'])
trigram_negative_df = pd.DataFrame(
trigram_negative.toarray(), columns=vect.get_feature_names_out())
trigram_frequency = pd.DataFrame(trigram_negative_df.sum(axis=0)).reset_index()
trigram_frequency.columns = ['trigram', 'frequency']
trigram_frequency = trigram_frequency.sort_values(
by='frequency', ascending=False).head(20)
plt.barh(trigram_frequency["trigram"], width=trigram_frequency["frequency"]);
It can be observed that most negative reviews mention that the food is not worth the price or is not recommended, followed by complaints about the food itself.
# dicts of pos tags: human-readable descriptions for the Penn Treebank tags
# emitted by nltk.pos_tag; used to annotate the lexicon tables below.
posTagDict = {
'CC': 'Conjunction',
'CD': 'cardinal digit',
'DT': 'determiner',
'EX': 'existential',
'FW': 'foreign word',
'IN': 'preposition/subordinating conjunction',
'JJ': 'adjective',
'JJR': 'adjective, comparative',
'JJS': 'adjective, superlative',
'LS': 'list marker',
'MD': 'modal',
'NN': 'noun, singular',
'NNS': 'noun plural',
'NNP': 'proper noun',
'NNPS': 'proper noun',
'PDT': 'predeterminer',
'POS': 'possessive ending',
'PRP': 'personal pronoun',
'PRP$': 'possessive pronoun',
'RB': 'adverb',
'RBR': 'adverb',
'RBS': 'adverb, superlative',
'RP': 'particle',
'TO': 'to',
'UH': 'interjection',
'VB': 'verb, base form',
'VBD': 'verb, past tense',
'VBG': 'verb, gerund/present',
'VBN': 'verb, past participle taken',
'VBP': 'verb, present',
'VBZ': 'verb, 3rd person',
'WDT': 'wh-determiner',
'WP': 'wh-pronoun',
'WP$': 'possessive wh-pronoun',
'WRB': 'wh-abverb where, when',
}
# pos_tagging
# Overall pos tags: tag every token of every review (slow on 300k+ rows)
df_pos = df.copy()
df_pos["tag"] = df_pos["Review"].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))
# overall sentiment label (same mapping as used for df_ori)
df_pos["Label"] = df_pos["Rating"].apply(labelFunc)
# separated into 3 class
df_pos_positive = df_pos[df_pos["Label"] == "Positive"]
df_pos_neutral = df_pos[df_pos["Label"] == "Neutral"]
df_pos_negative = df_pos[df_pos["Label"] == "Negative"]
# POS tagging example for overall
df_pos[['Review', 'tag']]
| Review | tag | |
|---|---|---|
| 0 | David, Thanga, Mikail and Chef Steven gave bri... | [(David, NNP), (,, ,), (Thanga, NNP), (,, ,), ... |
| 1 | We visited for family celebration and as usual... | [(We, PRP), (visited, VBD), (for, IN), (family... |
| 2 | Perfect dinner after a long journey.After trav... | [(Perfect, NNP), (dinner, NN), (after, IN), (a... |
| 3 | Had a great Tomahawk for me and hubby thanks t... | [(Had, VBD), (a, DT), (great, JJ), (Tomahawk, ... |
| 4 | Excellent ambient. Excellent service by David,... | [(Excellent, JJ), (ambient, NN), (., .), (Exce... |
| ... | ... | ... |
| 339690 | Delicious food. I passed by this place while s... | [(Delicious, JJ), (food, NN), (., .), (I, PRP)... |
| 339691 | To put it mildly, Horrible food, Horrible cust... | [(To, TO), (put, VB), (it, PRP), (mildly, RB),... |
| 339692 | Good vegetarian selection and the food was gre... | [(Good, JJ), (vegetarian, JJ), (selection, NN)... |
| 339693 | Simple atmosphere with fresh air (on a rainy n... | [(Simple, NN), (atmosphere, RB), (with, IN), (... |
| 339694 | Very good Cheese Naan! | [(Very, RB), (good, JJ), (Cheese, NNP), (Naan,... |
339695 rows × 2 columns
# combine all (word, tag) pairs into one flat list per sentiment category
pos_tags_overall = [item for row in df_pos["tag"] for item in row]
pos_tags_positive = [item for row in df_pos_positive["tag"] for item in row]
pos_tags_neutral = [item for row in df_pos_neutral["tag"] for item in row]
pos_tags_negative = [item for row in df_pos_negative["tag"] for item in row]
# Top POS for each category
def top_pos(tag, pos_tags):
    """Return the 20 most common lower-cased words among *pos_tags*
    whose POS tag starts with *tag*, punctuation and stopwords removed."""
    # keep only tokens whose tag matches the requested prefix
    matching = [tok for tok, pos in pos_tags if pos.startswith(tag)]
    # lower-case everything, then re-tokenize to strip punctuation
    joined = " ".join(tok.lower() for tok in matching)
    cleaned = nltk.RegexpTokenizer(r"\w+").tokenize(joined)
    # drop stopwords before counting frequencies
    filtered = [tok for tok in cleaned if tok not in stopwordz]
    return nltk.FreqDist(filtered).most_common(20)
# tag each of the overall top-20 unigrams and map tags to descriptions
overall_unigram_pos = overallUnigram_df['Unigram'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x))[0])
overall_df = pd.DataFrame.from_records(overall_unigram_pos, columns=['Lexicon', 'POS Tag'])
overall_df['POS Tag'] = overall_df['POS Tag'].apply(lambda x: posTagDict.get(x))
overall_df.head(10)
| Lexicon | POS Tag | |
|---|---|---|
| 0 | food | noun, singular |
| 1 | good | adjective |
| 2 | not | adverb |
| 3 | service | noun, singular |
| 4 | place | noun, singular |
| 5 | great | adjective |
| 6 | nice | adjective |
| 7 | restaurant | noun, singular |
| 8 | staff | noun, singular |
| 9 | delicious | adjective |
# nouns (tags starting with NN)
nouns_overall = top_pos("NN", pos_tags_overall)
nouns_df = pd.DataFrame.from_records(nouns_overall, columns=['Lexicon', 'Frequency'])
nouns_df.head(10) # top 10
| Lexicon | Frequency | |
|---|---|---|
| 0 | food | 207237 |
| 1 | service | 82878 |
| 2 | place | 81919 |
| 3 | restaurant | 63547 |
| 4 | staff | 46335 |
| 5 | price | 34803 |
| 6 | time | 33630 |
| 7 | chicken | 29190 |
| 8 | taste | 27895 |
| 9 | dishes | 27026 |
# adjectives (tags starting with JJ)
adjs_overall = top_pos("JJ", pos_tags_overall)
adjs_df = pd.DataFrame.from_records(adjs_overall, columns=['Lexicon', 'Frequency'])
adjs_df.head(10) # top 10
| Lexicon | Frequency | |
|---|---|---|
| 0 | good | 141340 |
| 1 | great | 54911 |
| 2 | nice | 50057 |
| 3 | delicious | 36035 |
| 4 | best | 32191 |
| 5 | friendly | 31002 |
| 6 | excellent | 22123 |
| 7 | fresh | 16482 |
| 8 | tasty | 16339 |
| 9 | reasonable | 15614 |
# verbs (tags starting with V)
verbs_overall = top_pos("V", pos_tags_overall)
verbs_df = pd.DataFrame.from_records(verbs_overall, columns=['Lexicon', 'Frequency'])
verbs_df.head(10) # top 10
| Lexicon | Frequency | |
|---|---|---|
| 0 | try | 25966 |
| 1 | go | 21647 |
| 2 | ordered | 21448 |
| 3 | come | 20373 |
| 4 | get | 18513 |
| 5 | love | 17978 |
| 6 | served | 17060 |
| 7 | recommend | 15934 |
| 8 | recommended | 14079 |
| 9 | eat | 13496 |
# adverbs (tags starting with RB)
adverbs_overall = top_pos("RB", pos_tags_overall)
adverbs_df = pd.DataFrame.from_records(adverbs_overall, columns=['Lexicon', 'Frequency'])
adverbs_df.head(10) # top 10
| Lexicon | Frequency | |
|---|---|---|
| 0 | not | 87976 |
| 1 | really | 36064 |
| 2 | n | 35899 |
| 3 | also | 30446 |
| 4 | well | 25090 |
| 5 | back | 22039 |
| 6 | definitely | 18895 |
| 7 | even | 17041 |
| 8 | quite | 16156 |
| 9 | always | 12185 |
# show one extra row (11th adverb) beyond the table above
adverbs_df.head(11)
| Lexicon | Frequency | |
|---|---|---|
| 0 | not | 87976 |
| 1 | really | 36064 |
| 2 | n | 35899 |
| 3 | also | 30446 |
| 4 | well | 25090 |
| 5 | back | 22039 |
| 6 | definitely | 18895 |
| 7 | even | 17041 |
| 8 | quite | 16156 |
| 9 | always | 12185 |
| 10 | still | 11730 |
def pos_table(df, pos_tags):
    """Build a DataFrame whose columns are the top-20 nouns, verbs,
    adjectives and adverbs extracted from *pos_tags*.

    Note: the *df* parameter is unused (kept for call-site compatibility);
    only *pos_tags* drives the result.
    """
    # (frequency list, column name) pairs, in display order
    tag_specs = [
        (top_pos("NN", pos_tags), "Noun"),
        (top_pos("V", pos_tags), "Verb"),
        (top_pos("JJ", pos_tags), "Adjective"),
        (top_pos("RB", pos_tags), "Adverb"),
    ]
    # one single-column frame per POS, keeping just the words
    frames = [pd.DataFrame({name: [word for word, _ in freqs]})
              for freqs, name in tag_specs]
    return pd.concat(frames, axis=1)
# top POS table for the whole corpus
pos_overall = pos_table(df_pos, pos_tags_overall)
pos_overall
| Noun | Verb | Adjective | Adverb | |
|---|---|---|---|---|
| 0 | food | try | good | not |
| 1 | service | go | great | really |
| 2 | place | ordered | nice | n |
| 3 | restaurant | come | delicious | also |
| 4 | staff | get | best | well |
| 5 | price | love | friendly | back |
| 6 | time | served | excellent | definitely |
| 7 | chicken | recommend | fresh | even |
| 8 | taste | recommended | tasty | quite |
| 9 | dishes | eat | reasonable | always |
| 10 | menu | came | many | still |
| 11 | dinner | went | little | especially |
| 12 | rice | made | local | however |
| 13 | experience | make | small | ever |
| 14 | quality | tried | first | never |
| 15 | lunch | find | indian | friendly |
| 16 | order | enjoyed | expensive | highly |
| 17 | meal | visit | big | lovely |
| 18 | great | want | bad | much |
| 19 | nice | take | much | pretty |
The top common lexicons for nouns, verbs, adjectives, and adverb, are shown on top
# top POS table for positive reviews
pos_positive = pos_table(df_pos_positive, pos_tags_positive)
pos_positive
| Noun | Verb | Adjective | Adverb | |
|---|---|---|---|---|
| 0 | food | try | good | not |
| 1 | place | love | great | really |
| 2 | service | come | nice | also |
| 3 | restaurant | go | delicious | n |
| 4 | staff | recommend | best | well |
| 5 | price | ordered | friendly | back |
| 6 | time | get | excellent | definitely |
| 7 | chicken | recommended | tasty | quite |
| 8 | dishes | served | reasonable | even |
| 9 | dinner | eat | fresh | always |
| 10 | menu | made | many | especially |
| 11 | taste | enjoyed | local | still |
| 12 | great | went | little | friendly |
| 13 | experience | came | indian | ever |
| 14 | rice | make | attentive | highly |
| 15 | nice | tried | amazing | lovely |
| 16 | lunch | visit | first | however |
| 17 | meal | find | big | never |
| 18 | ambience | enjoy | authentic | atmosphere |
| 19 | kl | loved | small | much |
# top POS table for neutral reviews
pos_neutral = pos_table(df_pos_neutral, pos_tags_neutral)
pos_neutral
| Noun | Verb | Adjective | Adverb | |
|---|---|---|---|---|
| 0 | food | ordered | good | not |
| 1 | service | came | bad | n |
| 2 | restaurant | served | expensive | even |
| 3 | place | go | nice | really |
| 4 | staff | get | small | back |
| 5 | time | asked | better | also |
| 6 | taste | come | last | never |
| 7 | order | told | many | still |
| 8 | price | went | poor | well |
| 9 | chicken | said | much | however |
| 10 | table | got | first | quite |
| 11 | quality | eat | great | ever |
| 12 | menu | took | fresh | better |
| 13 | rice | take | average | definitely |
| 14 | waiter | serve | disappointed | almost |
| 15 | experience | arrived | terrible | instead |
| 16 | dishes | know | high | much |
| 17 | dinner | make | worst | away |
| 18 | meal | want | ok | extremely |
| 19 | minutes | made | slow | later |
# top POS table for negative reviews
pos_negative = pos_table(df_pos_negative, pos_tags_negative)
pos_negative
| Noun | Verb | Adjective | Adverb | |
|---|---|---|---|---|
| 0 | food | ordered | good | not |
| 1 | place | get | nice | n |
| 2 | service | go | great | quite |
| 3 | restaurant | served | average | really |
| 4 | price | try | ok | also |
| 5 | taste | came | small | however |
| 6 | chicken | come | bad | well |
| 7 | time | went | expensive | even |
| 8 | staff | eat | little | still |
| 9 | dishes | think | better | back |
| 10 | rice | find | many | better |
| 11 | bit | tried | much | rather |
| 12 | menu | say | best | pretty |
| 13 | order | fried | friendly | especially |
| 14 | quality | serve | local | much |
| 15 | dinner | take | delicious | maybe |
| 16 | lunch | got | high | definitely |
| 17 | table | tasted | fresh | always |
| 18 | nothing | need | tasty | enough |
| 19 | drinks | want | reasonable | slightly |
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# download necessary package
# nltk.download('vader_lexicon');
# Run lexicon polarity: VADER scores for every review
polarityAnalyzer = SentimentIntensityAnalyzer()
df_polarity = df.copy()
df_polarity['Polarity_raw'] = df_polarity['Review'].apply(lambda x: polarityAnalyzer.polarity_scores(x))
# unpack the VADER score dict into separate numeric columns
df_polarity['Compound'] = df_polarity['Polarity_raw'].apply(lambda x: x['compound'])
df_polarity['Positive'] = df_polarity['Polarity_raw'].apply(lambda x: x['pos'])
df_polarity['Neutral'] = df_polarity['Polarity_raw'].apply(lambda x: x['neu'])
df_polarity['Negative'] = df_polarity['Polarity_raw'].apply(lambda x: x['neg'])
# compound score distribution
df_polarity['Compound'].plot(kind='hist', ylabel="Frequency", title="Compound Score Distribution", bins=50);
# mean of each polarity component across all reviews
df_polarity[['Compound', 'Positive', 'Neutral', 'Negative']].mean().plot(kind='bar', ylabel="Avg Score",
title="Average Polarity Score");